/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * NumericToNominal.java
 * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.SimpleBatchFilter;

/**
 <!-- globalinfo-start -->
 * A filter for turning numeric attributes into nominal ones. Unlike discretization, it just takes all numeric values and adds them to the list of nominal values of that attribute. Useful after CSV imports, to enforce certain attributes to become nominal, e.g., the class attribute, containing values from 1 to 5.
 * <p/>
 <!-- globalinfo-end -->
 * 
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -R &lt;col1,col2-col4,...&gt;
 *  Specifies list of columns to Discretize. First and last are valid indexes.
 *  (default: first-last)</pre>
 * 
 * <pre> -V
 *  Invert matching sense of column indexes.</pre>
 * 
 <!-- options-end -->
 *
 * @author  fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 8575 $
 */
public class NumericToNominal
  extends SimpleBatchFilter {

  /** for serialization */
  private static final long serialVersionUID = -6614630932899796239L;

  /** the maximum number of decimals to use */
  protected final static int MAX_DECIMALS = 6;
  
  /** Stores which columns to turn into nominals */
  protected Range m_Cols = new Range("first-last");

  /** The default columns to turn into nominals */
  protected String m_DefaultCols = "first-last";

  /**
   * Returns a string describing this filter
   *
   * @return 		a description of the filter suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return 
        "A filter for turning numeric attributes into nominal ones. Unlike "
      + "discretization, it just takes all numeric values and adds them to "
      + "the list of nominal values of that attribute. Useful after CSV "
      + "imports, to enforce certain attributes to become nominal, e.g., "
      + "the class attribute, containing values from 1 to 5.";
  }

  /**
   * Gets an enumeration describing the available options.
   *
   * @return 		an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector result = new Vector();

    result.addElement(new Option(
	"\tSpecifies list of columns to Discretize. First"
	+ " and last are valid indexes.\n"
	+ "\t(default: first-last)",
	"R", 1, "-R <col1,col2-col4,...>"));

    result.addElement(new Option(
	"\tInvert matching sense of column indexes.",
	"V", 0, "-V"));

    return result.elements();
  }

  /**
   * Parses a given list of options. <p/>
   * 
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -R &lt;col1,col2-col4,...&gt;
   *  Specifies list of columns to Discretize. First and last are valid indexes.
   *  (default: first-last)</pre>
   * 
   * <pre> -V
   *  Invert matching sense of column indexes.</pre>
   * 
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String	tmpStr;

    super.setOptions(options);
    
    setInvertSelection(Utils.getFlag('V', options));

    tmpStr = Utils.getOption('R', options);
    if (tmpStr.length() != 0)
      setAttributeIndices(tmpStr);
    else
      setAttributeIndices(m_DefaultCols);

    if (getInputFormat() != null)
      setInputFormat(getInputFormat());
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    int       i;
    Vector    result;
    String[]  options;

    result = new Vector();
    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);

    if (!getAttributeIndices().equals("")) {
      result.add("-R");
      result.add(getAttributeIndices());
    }

    if (getInvertSelection())
      result.add("-V");

    return (String[]) result.toArray(new String[result.size()]);	  
  }

  /**
   * Returns the tip text for this property
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return 
        "Set attribute selection mode. If false, only selected"
      + " (numeric) attributes in the range will be 'nominalized'; if"
      + " true, only non-selected attributes will be 'nominalized'.";
  }

  /**
   * Gets whether the supplied columns are to be worked on or the others.
   *
   * @return 		true if the supplied columns will be worked on
   */
  public boolean getInvertSelection() {
    return m_Cols.getInvert();
  }

  /**
   * Sets whether selected columns should be worked on or all the others apart
   * from these. If true all the other columns are considered for 
   * "nominalization".
   *
   * @param value 	the new invert setting
   */
  public void setInvertSelection(boolean value) {
    m_Cols.setInvert(value);
  }

  /**
   * Returns the tip text for this property
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return "Specify range of attributes to act on."
      + " This is a comma separated list of attribute indices, with"
      + " \"first\" and \"last\" valid values. Specify an inclusive"
      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
  }

  /**
   * Gets the current range selection
   *
   * @return 		a string containing a comma separated list of ranges
   */
  public String getAttributeIndices() {
    return m_Cols.getRanges();
  }

  /**
   * Sets which attributes are to be "nominalized" (only numeric
   * attributes among the selection will be transformed).
   *
   * @param value 	a string representing the list of attributes. Since
   * 			the string will typically come from a user, attributes 
   * 			are indexed from 1. <br> eg: first-3,5,6-last
   * @throws IllegalArgumentException if an invalid range list is supplied 
   */
  public void setAttributeIndices(String value) {
    m_Cols.setRanges(value);
  }

  /**
   * Sets which attributes are to be transoformed to nominal. (only numeric
   * attributes among the selection will be transformed).
   *
   * @param value 	an array containing indexes of attributes to nominalize.
   * 			Since the array will typically come from a program, 
   * 			attributes are indexed from 0.
   * @throws IllegalArgumentException if an invalid set of ranges is supplied 
   */
  public void setAttributeIndicesArray(int[] value) {
    setAttributeIndices(Range.indicesToRangeList(value));
  }

  /** 
   * Returns the Capabilities of this filter.
   *
   * @return            the capabilities of this object
   * @see               Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enableAllAttributes();
    result.enable(Capability.MISSING_VALUES);
    
    // class
    result.enableAllClasses();
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);
    
    return result;
  }

  /**
   * Determines the output format based on the input format and returns 
   * this. In case the output format cannot be returned immediately, i.e.,
   * immediateOutputFormat() returns false, then this method will be called
   * from batchFinished().
   *
   * @param inputFormat     the input format to base the output format on
   * @return                the output format
   * @throws Exception      in case the determination goes wrong
   * @see   #hasImmediateOutputFormat()
   * @see   #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat)
      throws Exception {
    
    Instances 	data;
    Instances	result;
    FastVector	atts;
    FastVector	values;
    HashSet	hash;
    int		i;
    int		n;
    boolean	isDate;
    Instance	inst;
    Vector	sorted;

    m_Cols.setUpper(inputFormat.numAttributes() - 1);
    data = new Instances(inputFormat);
    atts = new FastVector();
    for (i = 0; i < data.numAttributes(); i++) {
      if (!m_Cols.isInRange(i) || !data.attribute(i).isNumeric()) {
	atts.addElement(data.attribute(i));
	continue;
      }
      
      // date attribute?
      isDate = (data.attribute(i).type() == Attribute.DATE);
      
      // determine all available attribtues in dataset
      hash   = new HashSet();
      for (n = 0; n < data.numInstances(); n++) {
	inst = data.instance(n);
	if (inst.isMissing(i))
	  continue;
	
	if (isDate)
	  hash.add(inst.stringValue(i));
	else
	  hash.add(new Double(inst.value(i)));
      }
      
      // sort values
      sorted = new Vector();
      for (Object o: hash)
	sorted.add(o);
      Collections.sort(sorted);
      
      // create attribute from sorted values
      values = new FastVector();
      for (Object o: sorted) {
	if (isDate)
	  values.addElement(
	      o.toString());
	else
	  values.addElement(
	      Utils.doubleToString(((Double) o).doubleValue(), MAX_DECIMALS));
      }
      Attribute newAtt = new Attribute(data.attribute(i).name(), values);
      newAtt.setWeight(data.attribute(i).weight());
      atts.addElement(newAtt);
    }
    
    result = new Instances(inputFormat.relationName(), atts, 0);
    result.setClassIndex(inputFormat.classIndex());
    
    return result;
  }

  /**
   * Processes the given data (may change the provided dataset) and returns
   * the modified version. This method is called in batchFinished().
   *
   * @param instances   the data to process
   * @return            the modified data
   * @throws Exception  in case the processing goes wrong
   * @see               #batchFinished()
   */
  public Instances process(Instances instances) throws Exception {
    Instances	result;
    int		i;
    int		n;
    double[]	values;
    String	value;
    Instance	inst;
    Instance	newInst;
    
    // we need the complete input data!
    if (!isFirstBatchDone())
      setOutputFormat(determineOutputFormat(getInputFormat()));
    
    result = new Instances(getOutputFormat());
    
    for (i = 0; i < instances.numInstances(); i++) {
      inst   = instances.instance(i);
      values = inst.toDoubleArray();
      
      for (n = 0; n < values.length; n++) {
	if (    !m_Cols.isInRange(n)
	     || !instances.attribute(n).isNumeric() 
	     || inst.isMissing(n) )
	  continue;

	// get index of value
	if (instances.attribute(n).type() == Attribute.DATE)
	  value = inst.stringValue(n);
	else
	  value = Utils.doubleToString(inst.value(n), MAX_DECIMALS);
	
	values[n] = result.attribute(n).indexOfValue(value);
      }
      
      // generate new instance
      if (inst instanceof SparseInstance)
	newInst = new SparseInstance(inst.weight(), values);
      else
	newInst = new DenseInstance(inst.weight(), values);
      
      // copy possible string, relational values
      newInst.setDataset(getOutputFormat());
      copyValues(newInst, false, inst.dataset(), getOutputFormat());
      
      result.add(newInst);
    }
    
    return result;
  }
  
  /**
   * Returns the revision string.
   * 
   * @return		the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 8575 $");
  }

  /**
   * Runs the filter with the given parameters. Use -h to list options.
   * 
   * @param args	the commandline options
   */
  public static void main(String[] args) {
    runFilter(new NumericToNominal(), args);
  }
}
